single-cell-analysis-Scanpy-COVID 19 PMBC¶
In [195]:
## scRNA-seq anlysis using Scanpy for six PBMC samples
In [196]:
# pip install scanpy
In [197]:
## Load Libraries
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib as plt
import seaborn as sns
In [198]:
# setting verbosity
sc.settings.verbosity = 3
Loading data¶
In [199]:
# read data files
## Also making var names unique
data_cov1 = sc.read_10x_h5('./data/raw/nCoV_PBMC_1.h5')
data_cov1.var_names_make_unique()
data_cov15 = sc.read_10x_h5('./data/raw/nCoV_PBMC_15.h5')
data_cov15.var_names_make_unique()
data_cov17 = sc.read_10x_h5('./data/raw/nCoV_PBMC_17.h5')
data_cov17.var_names_make_unique()
data_ctrl5 = sc.read_10x_h5('./data/raw/Normal_PBMC_5.h5')
data_ctrl5.var_names_make_unique()
data_ctrl13 = sc.read_10x_h5('./data/raw/Normal_PBMC_13.h5')
data_ctrl13.var_names_make_unique()
data_ctrl14 = sc.read_10x_h5('./data/raw/Normal_PBMC_14.h5')
data_ctrl14.var_names_make_unique()
reading ./data/raw/nCoV_PBMC_1.h5
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
(0:00:00) reading ./data/raw/nCoV_PBMC_15.h5
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
(0:00:00)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
reading ./data/raw/nCoV_PBMC_17.h5 (0:00:00) reading ./data/raw/Normal_PBMC_5.h5 (0:00:00)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
reading ./data/raw/Normal_PBMC_13.h5 (0:00:00) reading ./data/raw/Normal_PBMC_14.h5 (0:00:00)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/anndata/_core/anndata.py:1756: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
In [200]:
# Creating merged object from samples
data_cov1.obs['type']="Covid"
data_cov1.obs['sample']="covid_1"
data_cov15.obs['type']="Covid"
data_cov15.obs['sample']="covid_15"
data_cov17.obs['type']="Covid"
data_cov17.obs['sample']="covid_17"
data_ctrl5.obs['type']="Ctrl"
data_ctrl5.obs['sample']="ctrl_5"
data_ctrl13.obs['type']="Ctrl"
data_ctrl13.obs['sample']="ctrl_13"
data_ctrl14.obs['type']="Ctrl"
data_ctrl14.obs['sample']="ctrl_14"
In [201]:
# Creating Ann data object for further Analysis
adata = data_cov1.concatenate( data_cov15, data_cov17, data_ctrl5, data_ctrl14, data_ctrl13 )
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/2714589487.py:3: FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html adata = data_cov1.concatenate( data_cov15, data_cov17, data_ctrl5, data_ctrl14, data_ctrl13 )
In [202]:
# delete individual datasets to save space
del(data_cov1, data_cov15, data_cov17)
del(data_ctrl5, data_ctrl13, data_ctrl14)
In [203]:
## data
adata
Out[203]:
AnnData object with n_obs × n_vars = 9000 × 33538
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types', 'genome'
In [204]:
# values
adata.obs['sample'].value_counts()
Out[204]:
sample covid_1 1500 covid_15 1500 covid_17 1500 ctrl_5 1500 ctrl_14 1500 ctrl_13 1500 Name: count, dtype: int64
In [205]:
## Quality Control
# Mitochondrial Genes
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Ribosomal Genes
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RBL"))
# Hemoglobin genes
adata.var['hb'] = adata.var_names.str.contains(("^HB[^(P)]"))
adata.var
Out[205]:
| gene_ids | feature_types | genome | mt | ribo | hb | |
|---|---|---|---|---|---|---|
| MIR1302-2HG | ENSG00000243485 | Gene Expression | GRCh38 | False | False | False |
| FAM138A | ENSG00000237613 | Gene Expression | GRCh38 | False | False | False |
| OR4F5 | ENSG00000186092 | Gene Expression | GRCh38 | False | False | False |
| AL627309.1 | ENSG00000238009 | Gene Expression | GRCh38 | False | False | False |
| AL627309.3 | ENSG00000239945 | Gene Expression | GRCh38 | False | False | False |
| ... | ... | ... | ... | ... | ... | ... |
| AC233755.2 | ENSG00000277856 | Gene Expression | GRCh38 | False | False | False |
| AC233755.1 | ENSG00000275063 | Gene Expression | GRCh38 | False | False | False |
| AC240274.1 | ENSG00000271254 | Gene Expression | GRCh38 | False | False | False |
| AC213203.1 | ENSG00000277475 | Gene Expression | GRCh38 | False | False | False |
| FAM231C | ENSG00000268674 | Gene Expression | GRCh38 | False | False | False |
33538 rows × 6 columns
In [206]:
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo', 'hb'], percent_top=None, log1p=False, inplace=True)
In [207]:
mito_genes = adata.var_names.str.startswith('Mt-')
In [208]:
adata.obs['percent_mt2'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
In [209]:
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata
Out[209]:
AnnData object with n_obs × n_vars = 9000 × 33538
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
In [210]:
# Plotting Qc Metrics
# Violin Plot
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'], jitter = 0.4, groupby='sample', rotation = 45)
In [211]:
# Scatter Plot of the total counts
sc.pl.scatter(adata, x = 'total_counts', y = 'pct_counts_mt', color = 'sample')
In [212]:
sc.pl.scatter(adata, x='n_genes_by_counts', y='pct_counts_ribo', color='sample')
In [213]:
# Filtering
## filtering cells that min 200 genes
sc.pp.filter_cells(adata, min_genes = 200)
## filtering genes that are in atleast 3 cells
sc.pp.filter_genes(adata, min_cells = 3)
print(adata.n_obs, adata.n_vars)
filtered out 1021 cells that have less than 200 genes expressed filtered out 14760 genes that are detected in less than 3 cells 7979 18778
In [214]:
sc.pl.highest_expr_genes(adata, n_top=20)
normalizing counts per cell
finished (0:00:00)
In [215]:
## Mitchondrial and Ribosomal Filtering
adata = adata[adata.obs['pct_counts_mt'] < 20, :]
adata = adata[adata.obs['pct_counts_ribo'] > 5, :]
print("Remaining cells %d"%adata.n_obs)
Remaining cells 5048
In [216]:
### Plotting Filtered QC metrics
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'],
jitter=0.4, groupby = 'sample', rotation =45)
In [217]:
# Filter based on genes
## MALAT1
malat1 = adata.var_names.str.startswith('MALAT1')
In [218]:
mito_genes = adata.var_names.str.startswith('MT-')
In [219]:
hb_genes = adata.var_names.str.contains('^HB[^(P)]')
In [220]:
remove = np.add(mito_genes, malat1)
remove = np.add(remove, hb_genes)
keep = np.invert(remove)
adata = adata[:,keep]
print(adata.n_obs, adata.n_vars)
5048 18752
In [221]:
## Sampling
annot = sc.queries.biomart_annotations(
"hsapiens",
["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
).set_index("external_gene_name")
In [222]:
chrY_genes = adata.var_names.intersection(annot.index[annot.chromosome_name == "Y"])
chrY_genes
Out[222]:
Index(['PLCXD1', 'GTPBP6', 'LINC00685', 'PPP2R3B', 'CRLF2', 'CSF2RA', 'IL3RA',
'SLC25A6', 'LINC00106', 'ASMTL-AS1', 'ASMTL', 'P2RY8', 'AKAP17A',
'ASMT', 'DHRSX', 'ZBED1', 'LINC00102', 'CD99', 'VAMP7', 'IL9R',
'RPS4Y1', 'ZFY', 'ZFY-AS1', 'LINC00278', 'PCDH11Y', 'USP9Y', 'DDX3Y',
'UTY', 'TMSB4Y', 'NLGN4Y', 'TTTY14', 'KDM5D', 'EIF1AY', 'RPS4Y2'],
dtype='object')
In [223]:
adata.obs['percent_chrY'] = np.sum(
adata[:, chrY_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 * 100
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/4037913768.py:1: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual. adata.obs['percent_chrY'] = np.sum(
In [224]:
adata.obs["XIST-counts"] = adata.X[:,adata.var_names.str.match('XIST')].toarray()
sc.pl.scatter(adata, x='XIST-counts', y='percent_chrY', color="sample")
In [225]:
sc.pl.violin(adata, ["XIST-counts", "percent_chrY"], jitter=0.4, groupby = 'sample', rotation= 45)
Normalized DATA¶
In [226]:
# save normalized counts in raw slot.
adata.raw = adata
# normalize to depth 10 000
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
# logaritmize
sc.pp.log1p(adata)
# scale
sc.pp.scale(adata)
normalizing by total count per cell
finished (0:00:00): normalized adata.X and added
'n_counts', counts per cell before normalization (adata.obs)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
Predicting Doublets¶
In [227]:
import scrublet as scr
In [228]:
scrub = scr.Scrublet(adata.raw.X)
adata.obs['doublet_scores'], adata.obs['predicted_doublets'] = scrub.scrub_doublets()
scrub.plot_histogram()
sum(adata.obs['predicted_doublets'])
Preprocessing... Simulating doublets... Embedding transcriptomes using PCA... Calculating doublet scores... Automatically set threshold at doublet score = 0.40 Detected doublet rate = 3.2% Estimated detectable doublet fraction = 46.9% Overall doublet rate: Expected = 10.0% Estimated = 6.8% Elapsed time: 6.3 seconds
Out[228]:
160
In [229]:
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
In [230]:
sc.pl.violin(adata, 'n_genes_by_counts',
jitter=0.4, groupby = 'doublet_info', rotation=45)
In [231]:
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
extracting highly variable genes
finished (0:00:03)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
regressing out ['total_counts', 'pct_counts_mt']
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_simple.py:668: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
finished (0:00:09)
computing PCA
with n_comps=50
finished (0:00:02)
computing neighbors
using 'X_pca' with n_pcs = 40
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:07)
In [232]:
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample'])
In [233]:
adata = adata.raw.to_adata()
adata = adata[adata.obs['doublet_info'] == 'False',:]
print(adata.shape)
(4888, 18752)
In [234]:
import os
os.makedirs('data/results/', exist_ok=True)
save_file = 'data/results/scanpy_qc_filtered_covid.h5ad'
adata.write_h5ad(save_file)
DIMENSIONALITY REDUCTION¶
In [235]:
sc.settings.set_figure_params(dpi=80)
In [236]:
adata = sc.read_h5ad('data/results/scanpy_qc_filtered_covid.h5ad')
In [237]:
adata
Out[237]:
AnnData object with n_obs × n_vars = 4888 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
obsp: 'connectivities', 'distances'
In [238]:
# normalize to depth 10 000
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
# logaritmize
sc.pp.log1p(adata)
# store normalized counts in the raw slot,
# will subset adata.X for variable genes, but want to keep all genes matrix as well.
adata.raw = adata
adata
normalizing by total count per cell
finished (0:00:00): normalized adata.X and added
'n_counts', counts per cell before normalization (adata.obs)
WARNING: adata.X seems to be already log-transformed.
Out[238]:
AnnData object with n_obs × n_vars = 4888 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
obsp: 'connectivities', 'distances'
In [239]:
# compute variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
print("Highly variable genes: %d"%sum(adata.var.highly_variable))
# plot variable genes
sc.pl.highly_variable_genes(adata)
#subset for variable genes in dataset
adata = adata[:, adata.var['highly_variable']]
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes: 3181
Z-score transformation¶
In [240]:
# regress out
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
# scale data
sc.pp.scale(adata, max_value=10)
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_simple.py:668: UserWarning: Received a view of an AnnData. Making a copy. view_to_actual(adata)
finished (0:00:10)
PCA¶
In [241]:
sc.tl.pca(adata, svd_solver='arpack')
computing PCA
with n_comps=50
finished (0:00:01)
In [242]:
sc.pl.pca(adata, color='sample', components = ['1,2', '3,4', '5,6', '7,8'], ncols=2)
In [243]:
# plot loading
sc.pl.pca_loadings(adata, components=[1,2,3,4,5,6,7,8])
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/IPython/core/pylabtools.py:170: UserWarning: Glyph 8942 (\N{VERTICAL ELLIPSIS}) missing from font(s) Arial.
fig.canvas.print_figure(bytes_io, **kw)
In [244]:
genes = adata.var['gene_ids']
for pc in [1,2,3,4]:
g = adata.varm['PCs'][:,pc-1]
o = np.argsort(g)
sel = np.concatenate((o[:10],o[-10:])).tolist()
emb = adata.obsm['X_pca'][:,pc-1]
# order by position on that pc
tempdata = adata[np.argsort(emb),]
sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(), groupby='predicted_doublets', swap_axes = True, use_raw=False)
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/3760547985.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(), groupby='predicted_doublets', swap_axes = True, use_raw=False)
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/3760547985.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(), groupby='predicted_doublets', swap_axes = True, use_raw=False)
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/3760547985.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(), groupby='predicted_doublets', swap_axes = True, use_raw=False)
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/3760547985.py:10: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` sc.pl.heatmap(tempdata, var_names = genes[sel].index.tolist(), groupby='predicted_doublets', swap_axes = True, use_raw=False)
In [245]:
# variance ration
sc.pl.pca_variance_ratio(adata, log=True, n_pcs = 50)
tSNE¶
In [246]:
sc.tl.tsne(adata, n_pcs = 30)
computing tSNE
using 'X_pca' with n_pcs = 30
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:09)
In [247]:
sc.pl.tsne(adata, color='sample')
UMAP¶
In [248]:
## CALCULATING NEIGHBOURHOOD GRAPH
sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20)
computing neighbors
using 'X_pca' with n_pcs = 30
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
In [249]:
sc.tl.umap(adata)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:08)
In [250]:
sc.pl.umap(adata, color='sample')
In [251]:
# for 10 components
umap10 = sc.tl.umap(adata, n_components=10, copy=True)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:09)
In [252]:
# plot the umap with neighbor edges
sc.pl.umap(adata, color='sample', title="UMAP", edges=True)
In [253]:
## PLOTS FOR GENE OF INTEREST
sc.pl.umap(adata, color=["CD3E", "CD4", "CD8A", "GNLY","NKG7", "MS4A1","CD14","LYZ","CST3","MS4A7","FCGR3A"])
In [254]:
genes = ["CD3E", "CD4", "CD8A", "GNLY","NKG7", "MS4A1","CD14","LYZ","CST3","MS4A7","FCGR3A"]
var_genes = adata.var.highly_variable
var_genes.index[var_genes]
varg = [x for x in genes if x in var_genes.index[var_genes]]
sc.pl.umap(adata, color=varg, use_raw=False)
In [255]:
## Writing H5ad file to save data
adata.write_h5ad('./data/results/scanpy_dr_covid.h5ad')
In [256]:
print(adata.X.shape)
print(adata.raw.X.shape)
(4888, 3181) (4888, 18752)
In [257]:
# Load the stored data object
save_file = './data/results/scanpy_dr_covid.h5ad'
adata = sc.read_h5ad(save_file)
In [258]:
print(adata.X.shape)
(4888, 3181)
In [259]:
adata2 = adata.raw.to_adata()
# check that the matrix looks like noramlized counts
print(adata2.X[1:10,1:10])
<Compressed Sparse Row sparse matrix of dtype 'float64' with 3 stored elements and shape (9, 9)> Coords Values (0, 5) 0.9678402572038912 (1, 5) 0.5124039238100646 (7, 0) 0.735933317095064
Detect variable genes¶
In [260]:
var_genes_all = adata.var.highly_variable
print("Highly variable genes: %d"%sum(var_genes_all))
Highly variable genes: 3181
In [261]:
sc.pp.log1p(adata2)
WARNING: adata.X seems to be already log-transformed.
In [262]:
sc.pp.highly_variable_genes(adata2, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key = 'sample')
print("Highly variable genes intersection: %d"%sum(adata2.var.highly_variable_intersection))
print("Number of batches where gene is variable:")
print(adata2.var.highly_variable_nbatches.value_counts())
var_genes_batch = adata2.var.highly_variable_nbatches > 0
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes intersection: 412
Number of batches where gene is variable:
highly_variable_nbatches
0 12065
1 3093
2 1570
3 786
4 483
6 412
5 343
Name: count, dtype: int64
In [263]:
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(adata2.var.highly_variable_nbatches == 6))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & adata2.var.highly_variable_intersection))
Any batch var genes: 6687 All data var genes: 3181 Overlap: 2662 Variable genes in all batches: 412 Overlap batch instersection and all: 354
In [264]:
var_select = adata2.var.highly_variable_nbatches > 2
var_genes = var_select.index[var_select]
len(var_genes)
Out[264]:
2024
Data Integration¶
COmbat¶
In [265]:
# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
alldata[batch] = adata2[adata2.obs['sample'] == batch,]
alldata
Out[265]:
{'covid_1': View of AnnData object with n_obs × n_vars = 740 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances',
'covid_15': View of AnnData object with n_obs × n_vars = 414 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances',
'covid_17': View of AnnData object with n_obs × n_vars = 673 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances',
'ctrl_5': View of AnnData object with n_obs × n_vars = 1003 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances',
'ctrl_13': View of AnnData object with n_obs × n_vars = 1101 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances',
'ctrl_14': View of AnnData object with n_obs × n_vars = 957 × 18752
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'X_pca', 'X_tsne', 'X_umap'
obsp: 'connectivities', 'distances'}
In [266]:
# create a new object with lognormalized counts
adata_combat = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs = adata.obs)
# first store the raw data
adata_combat.raw = adata_combat
# run combat
sc.pp.combat(adata_combat, key='sample')
Standardizing Data across genes. Found 6 batches Found 0 numerical variables: Found 69 genes with zero variance. Fitting L/S model and finding priors Finding parametric adjustments
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_combat.py:351: RuntimeWarning: invalid value encountered in divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max() /Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_combat.py:351: RuntimeWarning: divide by zero encountered in divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
Adjusting data
In [267]:
sc.pp.highly_variable_genes(adata_combat)
print("Highly variable genes: %d"%sum(adata_combat.var.highly_variable))
sc.pl.highly_variable_genes(adata_combat)
sc.pp.pca(adata_combat, n_comps=30, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata_combat, n_pcs =30)
sc.tl.umap(adata_combat)
sc.tl.tsne(adata_combat, n_pcs = 30)
extracting highly variable genes
finished (0:00:01)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes: 3960
computing PCA
with n_comps=30
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_pca.py:377: FutureWarning: Argument `use_highly_variable` is deprecated, consider using the mask argument. Use_highly_variable=True can be called through mask_var="highly_variable". Use_highly_variable=False can be called through mask_var=None warn(msg, FutureWarning)
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 30
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:07)
computing tSNE
using 'X_pca' with n_pcs = 30
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:10)
In [268]:
# compare var_genes
var_genes_combat = adata_combat.var.highly_variable
print("With all data %d"%sum(var_genes_all))
print("With combat %d"%sum(var_genes_combat))
print("Overlap %d"%sum(var_genes_all & var_genes_combat))
print("With 2 batches %d"%sum(var_select))
print("Overlap %d"%sum(var_genes_combat & var_select))
With all data 3181 With combat 3960 Overlap 2463 With 2 batches 2024 Overlap 1375
In [269]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 2, figsize=(8,4), constrained_layout=True)
# Plot Combat-corrected tSNE
sc.pl.tsne(adata_combat, color="sample", title="Combat tSNE", ax=axs[0], show=False)
# Plot Combat-corrected UMAP
sc.pl.umap(adata_combat, color="sample", title="Combat UMAP", ax=axs[1], show=False)
plt.show()
In [270]:
import scanorama
#subset the individual dataset to the same variable genes as in MNN-correct.
alldata2 = dict()
for ds in alldata.keys():
print(ds)
alldata2[ds] = alldata[ds][:,var_genes]
#convert to list of AnnData objects
adatas = list(alldata2.values())
# run scanorama.integrate
scanorama.integrate_scanpy(adatas, dimred = 50)
covid_1 covid_15 covid_17 ctrl_5 ctrl_13 ctrl_14 Found 2024 genes among all datasets [[0. 0.8115942 0.2986627 0.67432432 0.52837838 0.43108108] [0. 0. 0.54830918 0.62560386 0.39855072 0.43478261] [0. 0. 0. 0.43090639 0.12927192 0.21991085] [0. 0. 0. 0. 0.90329013 0.7108674 ] [0. 0. 0. 0. 0. 0.85684431] [0. 0. 0. 0. 0. 0. ]] Processing datasets (3, 4) Processing datasets (4, 5) Processing datasets (0, 1) Processing datasets (3, 5) Processing datasets (0, 3) Processing datasets (1, 3) Processing datasets (1, 2) Processing datasets (0, 4) Processing datasets (1, 5) Processing datasets (0, 5) Processing datasets (2, 3) Processing datasets (1, 4) Processing datasets (0, 2) Processing datasets (2, 5) Processing datasets (2, 4)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanorama/scanorama.py:284: ImplicitModificationWarning: Setting element `.obsm['X_scanorama']` of view, initializing view as actual. adata.obsm['X_scanorama'] = X_dimred
In [271]:
#scanorama adds the corrected matrix to adata.obsm in each of the datasets in adatas.
adatas[0].obsm['X_scanorama'].shape
Out[271]:
(740, 50)
In [272]:
# Get all the integrated matrices.
scanorama_int = [ad.obsm['X_scanorama'] for ad in adatas]
# make into one matrix.
all_s = np.concatenate(scanorama_int)
print(all_s.shape)
# add to the AnnData object
adata.obsm["Scanorama"] = all_s
(4888, 50)
In [273]:
# tsne and umap
sc.pp.neighbors(adata, n_pcs =50, use_rep = "Scanorama")
sc.tl.umap(adata)
sc.tl.tsne(adata, n_pcs = 50, use_rep = "Scanorama")
computing neighbors
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:07)
computing tSNE
using sklearn.manifold.TSNE
finished: added
'X_tsne', tSNE coordinates (adata.obsm)
'tsne', tSNE parameters (adata.uns) (0:00:11)
In [274]:
fig, axs = plt.subplots(2, 2, figsize=(8,8),constrained_layout=True)
sc.pl.tsne(adata_combat, color="sample", title="Combat tsne", ax=axs[0,0], show=False)
sc.pl.tsne(adata, color="sample", title="Scanorama tsne", ax=axs[0,1], show=False)
sc.pl.umap(adata_combat, color="sample", title="Combat umap", ax=axs[1,0], show=False)
sc.pl.umap(adata, color="sample", title="Scanorama umap", ax=axs[1,1], show=False)
Out[274]:
<Axes: title={'center': 'Scanorama umap'}, xlabel='UMAP1', ylabel='UMAP2'>
In [275]:
## Save file into H5ad format
save_file = './data/results/scanpy_scanorama_corrected_covid.h5ad'
adata.write_h5ad(save_file)
clustering¶
In [276]:
sc.settings.set_figure_params(dpi=80)
In [277]:
# scanorama integration data.
save_file = './data/results/scanpy_scanorama_corrected_covid.h5ad'
adata = sc.read_h5ad(save_file)
In [278]:
adata
Out[278]:
AnnData object with n_obs × n_vars = 4888 × 3181
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info'
var: 'gene_ids', 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'doublet_info_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'sample_colors', 'tsne', 'umap'
obsm: 'Scanorama', 'X_pca', 'X_tsne', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
Graph CLustering¶
Leiden Clustering¶
In [279]:
sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0
sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6")
sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4")
sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4")
running Leiden clustering
finished: found 15 clusters and added
'leiden_1.0', the cluster labels (adata.obs, categorical) (0:00:02)
running Leiden clustering
finished: found 11 clusters and added
'leiden_0.6', the cluster labels (adata.obs, categorical) (0:00:01)
running Leiden clustering
finished: found 8 clusters and added
'leiden_0.4', the cluster labels (adata.obs, categorical) (0:00:01)
running Leiden clustering
finished: found 16 clusters and added
'leiden_1.4', the cluster labels (adata.obs, categorical) (0:00:02)
In [280]:
sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4'])
In [281]:
sc.tl.dendrogram(adata, groupby = "leiden_0.6")
sc.pl.dendrogram(adata, groupby = "leiden_0.6")
genes = ["CD3E", "CD4", "CD8A", "GNLY","NKG7", "MS4A1","FCGR3A","CD14","LYZ","CST3","MS4A7","FCGR1A"]
sc.pl.dotplot(adata, genes, groupby='leiden_0.6', dendrogram=True)
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_leiden_0.6']`
In [282]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 2, figsize=(8,4), constrained_layout=True)
# Plot Combat-corrected tSNE
sc.pl.tsne(adata_combat, color="sample", title="Combat tSNE", ax=axs[0], show=False)
# Plot Combat-corrected UMAP
sc.pl.umap(adata_combat, color="sample", title="Combat UMAP", ax=axs[1], show=False)
plt.show()
In [283]:
tmp = pd.crosstab(adata.obs['leiden_0.6'],adata.obs['type'], normalize='index')
tmp.plot.bar(stacked=True).legend(loc='upper right')
Out[283]:
<matplotlib.legend.Legend at 0x14a026b90>
Louvain Clustering¶
In [284]:
sc.tl.louvain(adata, key_added = "louvain_1.0") # default resolution in 1.0
sc.tl.louvain(adata, resolution = 0.6, key_added = "louvain_0.6")
sc.tl.louvain(adata, resolution = 0.4, key_added = "louvain_0.4")
sc.tl.louvain(adata, resolution = 1.4, key_added = "louvain_1.4")
sc.pl.umap(adata, color=['louvain_0.4', 'louvain_0.6', 'louvain_1.0','louvain_1.4'])
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 13 clusters and added
'louvain_1.0', the cluster labels (adata.obs, categorical) (0:00:00)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 8 clusters and added
'louvain_0.6', the cluster labels (adata.obs, categorical) (0:00:00)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 7 clusters and added
'louvain_0.4', the cluster labels (adata.obs, categorical) (0:00:00)
running Louvain clustering
using the "louvain" package of Traag (2017)
finished: found 14 clusters and added
'louvain_1.4', the cluster labels (adata.obs, categorical) (0:00:00)
In [285]:
sc.tl.dendrogram(adata, groupby = "louvain_0.6")
sc.pl.dendrogram(adata, groupby = "louvain_0.6")
genes = ["CD3E", "CD4", "CD8A", "GNLY","NKG7", "MS4A1","FCGR3A","CD14","LYZ","CST3","MS4A7","FCGR1A"]
sc.pl.dotplot(adata, genes, groupby='louvain_0.6', dendrogram=True)
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_louvain_0.6']`
In [286]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
K-Means CLustering¶
In [287]:
# extract pca coordinates
X_pca = adata.obsm['Scanorama']
# kmeans with k=5
kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca)
adata.obs['kmeans5'] = kmeans.labels_.astype(str)
# kmeans with k=10
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_pca)
adata.obs['kmeans10'] = kmeans.labels_.astype(str)
# kmeans with k=15
kmeans = KMeans(n_clusters=15, random_state=0).fit(X_pca)
adata.obs['kmeans15'] = kmeans.labels_.astype(str)
sc.pl.umap(adata, color=['kmeans5', 'kmeans10', 'kmeans15'])
In [288]:
## Hierarchical clustering
In [289]:
from sklearn.cluster import AgglomerativeClustering
# Cluster with 5 clusters
cluster = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
adata.obs['hclust_5'] = cluster.fit_predict(X_pca).astype(str)
# Cluster with 10 clusters
cluster = AgglomerativeClustering(n_clusters=10, metric='euclidean', linkage='ward')
adata.obs['hclust_10'] = cluster.fit_predict(X_pca).astype(str)
# Cluster with 15 clusters
cluster = AgglomerativeClustering(n_clusters=15, metric='euclidean', linkage='ward')
adata.obs['hclust_15'] = cluster.fit_predict(X_pca).astype(str)
# Plot UMAP with clusters
sc.pl.umap(adata, color=['hclust_5', 'hclust_10', 'hclust_15'])
In [290]:
## Saving the data in a h5ad format
adata.write_h5ad('./data/results/scanpy_clustered_covid.h5ad')
Differential Expression Analsysis¶
In [291]:
import gseapy
In [292]:
adata = sc.read_h5ad('./data/results/scanpy_clustered_covid.h5ad')
In [293]:
print(adata.X.shape)
print(adata.raw.X.shape)
print(adata.raw.X[:10,:10])
(4888, 3181) (4888, 18752) <Compressed Sparse Row sparse matrix of dtype 'float64' with 5 stored elements and shape (10, 10)> Coords Values (0, 1) 0.9043687482537638 (0, 6) 0.9043687482537638 (1, 6) 0.9678402572038912 (2, 6) 0.5124039238100646 (8, 1) 0.735933317095064
In [294]:
adata = adata.raw.to_adata()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
normalizing by total count per cell
finished (0:00:00): normalized adata.X and added
'n_counts', counts per cell before normalization (adata.obs)
WARNING: adata.X seems to be already log-transformed.
T-test¶
In [295]:
sc.tl.rank_genes_groups(adata, 'louvain_0.6', method='t-test', key_added = "t-test")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test")
ranking genes
finished: added to `.uns['t-test']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)
T- test Overestimated¶
In [296]:
sc.tl.rank_genes_groups(adata, 'louvain_0.6', method='t-test_overestim_var', key_added = "t-test_ov")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key= "t-test_ov")
ranking genes
finished: added to `.uns['t-test_ov']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)
Wilcoxon rank-sum¶
In [297]:
sc.tl.rank_genes_groups(adata, 'louvain_0.6', method='wilcoxon', key_added= "wilcoxon")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key="wilcoxon")
ranking genes
finished: added to `.uns['wilcoxon']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:03)
Logisttic Regression test¶
In [298]:
sc.tl.rank_genes_groups(adata, 'louvain_0.6', method='logreg',key_added = "logreg")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "logreg")
ranking genes
finished: added to `.uns['logreg']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
(0:00:08)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Comparing Genes¶
compare DE gnes for cluster 0 with each test¶
In [299]:
wc = sc.get.rank_genes_groups_df(adata, group='0', key='wilcoxon', pval_cutoff=0.01, log2fc_min=0)['names']
tt = sc.get.rank_genes_groups_df(adata, group='0', key='t-test', pval_cutoff=0.01, log2fc_min=0)['names']
tt_ov = sc.get.rank_genes_groups_df(adata, group='0', key='t-test_ov', pval_cutoff=0.01, log2fc_min=0)['names']
In [300]:
from matplotlib_venn import venn3
venn3([set(wc),set(tt),set(tt_ov)], ('Wilcox','T-test','T-test_ov') )
plt.show()
HEAT MAP and Visulization¶
In [301]:
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6", show_gene_labels=True)
In [302]:
## DOT Plot
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6")
In [303]:
### Stacked Violin Plot
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6")
In [304]:
## Matrix PLot
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5, key="wilcoxon", groupby="louvain_0.6")
specific clusters Analysis¶
In [305]:
sc.tl.rank_genes_groups(adata, 'louvain_0.6', groups=['1'], reference='2', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['1'], n_genes=20)
ranking genes
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
In [306]:
sc.pl.rank_genes_groups_violin(adata, groups='1', n_genes=10)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/plotting/_tools/__init__.py:1320: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. _ax.set_xticklabels(new_gene_names, rotation="vertical")
In [307]:
# convert numpy.recarray to list
mynames = [x[0] for x in adata.uns['rank_genes_groups']['names'][:10]]
sc.pl.stacked_violin(adata, mynames, groupby = 'louvain_0.6')
Differential Expression across various Conditions¶
In [308]:
cl1 = adata[adata.obs['louvain_0.6'] =='1',:]
cl1.obs['type'].value_counts()
Out[308]:
type Ctrl 642 Covid 291 Name: count, dtype: int64
In [309]:
sc.tl.rank_genes_groups(cl1, 'type', method='wilcoxon', key_added = "wilcoxon")
sc.pl.rank_genes_groups(cl1, n_genes=25, sharey=False, key = "wilcoxon")
ranking genes
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/tools/_rank_genes_groups.py:645: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
adata.uns[key_added] = {}
finished: added to `.uns['wilcoxon']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
In [310]:
## Ranking genes (violin plots )
sc.pl.rank_genes_groups_violin(cl1, n_genes=10, key="wilcoxon")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/plotting/_tools/__init__.py:1320: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. _ax.set_xticklabels(new_gene_names, rotation="vertical")
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/plotting/_tools/__init__.py:1320: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. _ax.set_xticklabels(new_gene_names, rotation="vertical")
CATplot¶
In [311]:
genes1 = sc.get.rank_genes_groups_df(cl1, group='Covid', key='wilcoxon')['names'][:5]
genes2 = sc.get.rank_genes_groups_df(cl1, group='Ctrl', key='wilcoxon')['names'][:5]
genes = genes1.tolist() + genes2.tolist()
df_marker = sc.get.obs_df(adata, genes + ['louvain_0.6','type'], use_raw=False)
df2 = df_marker .melt(id_vars=["louvain_0.6",'type'], value_vars=genes)
In [312]:
sns.catplot(x = "louvain_0.6", y = "value", hue = "type", kind = 'violin',
col = "variable", data = df2, col_wrap=4, inner=None)
Out[312]:
<seaborn.axisgrid.FacetGrid at 0x13c98d850>
Gene Set Analysis¶
In [313]:
# Fetch the gene set names
gene_set_names = gseapy.get_library_name(organism='Human')
# Print each gene set name
for name in gene_set_names:
print(name)
ARCHS4_Cell-lines ARCHS4_IDG_Coexp ARCHS4_Kinases_Coexp ARCHS4_TFs_Coexp ARCHS4_Tissues Achilles_fitness_decrease Achilles_fitness_increase Aging_Perturbations_from_GEO_down Aging_Perturbations_from_GEO_up Allen_Brain_Atlas_10x_scRNA_2021 Allen_Brain_Atlas_down Allen_Brain_Atlas_up Azimuth_2023 Azimuth_Cell_Types_2021 BioCarta_2013 BioCarta_2015 BioCarta_2016 BioPlanet_2019 BioPlex_2017 CCLE_Proteomics_2020 CORUM COVID-19_Related_Gene_Sets COVID-19_Related_Gene_Sets_2021 Cancer_Cell_Line_Encyclopedia CellMarker_2024 CellMarker_Augmented_2021 ChEA_2013 ChEA_2015 ChEA_2016 ChEA_2022 Chromosome_Location Chromosome_Location_hg19 ClinVar_2019 DGIdb_Drug_Targets_2024 DSigDB Data_Acquisition_Method_Most_Popular_Genes DepMap_CRISPR_GeneDependency_CellLines_2023 DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 Descartes_Cell_Types_and_Tissue_2021 Diabetes_Perturbations_GEO_2022 DisGeNET Disease_Perturbations_from_GEO_down Disease_Perturbations_from_GEO_up Disease_Signatures_from_GEO_down_2014 Disease_Signatures_from_GEO_up_2014 DrugMatrix Drug_Perturbations_from_GEO_2014 Drug_Perturbations_from_GEO_down Drug_Perturbations_from_GEO_up ENCODE_Histone_Modifications_2013 ENCODE_Histone_Modifications_2015 ENCODE_TF_ChIP-seq_2014 ENCODE_TF_ChIP-seq_2015 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X ESCAPE Elsevier_Pathway_Collection Enrichr_Libraries_Most_Popular_Genes Enrichr_Submissions_TF-Gene_Coocurrence Enrichr_Users_Contributed_Lists_2020 Epigenomics_Roadmap_HM_ChIP-seq FANTOM6_lncRNA_KD_DEGs GO_Biological_Process_2013 GO_Biological_Process_2015 GO_Biological_Process_2017 GO_Biological_Process_2017b GO_Biological_Process_2018 GO_Biological_Process_2021 GO_Biological_Process_2023 GO_Cellular_Component_2013 GO_Cellular_Component_2015 GO_Cellular_Component_2017 GO_Cellular_Component_2017b GO_Cellular_Component_2018 GO_Cellular_Component_2021 GO_Cellular_Component_2023 GO_Molecular_Function_2013 GO_Molecular_Function_2015 GO_Molecular_Function_2017 GO_Molecular_Function_2017b GO_Molecular_Function_2018 GO_Molecular_Function_2021 GO_Molecular_Function_2023 GTEx_Aging_Signatures_2021 GTEx_Tissue_Expression_Down GTEx_Tissue_Expression_Up GTEx_Tissues_V8_2023 GWAS_Catalog_2019 GWAS_Catalog_2023 GeDiPNet_2023 GeneSigDB Gene_Perturbations_from_GEO_down Gene_Perturbations_from_GEO_up Genes_Associated_with_NIH_Grants Genome_Browser_PWMs GlyGen_Glycosylated_Proteins_2022 HDSigDB_Human_2021 HDSigDB_Mouse_2021 HMDB_Metabolites HMS_LINCS_KinomeScan HomoloGene HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression HuBMAP_ASCTplusB_augmented_2022 HumanCyc_2015 HumanCyc_2016 Human_Gene_Atlas Human_Phenotype_Ontology IDG_Drug_Targets_2022 InterPro_Domains_2019 Jensen_COMPARTMENTS Jensen_DISEASES Jensen_TISSUES KEA_2013 KEA_2015 KEGG_2013 KEGG_2015 KEGG_2016 KEGG_2019_Human KEGG_2019_Mouse KEGG_2021_Human KOMP2_Mouse_Phenotypes_2022 Kinase_Perturbations_from_GEO_down Kinase_Perturbations_from_GEO_up L1000_Kinase_and_GPCR_Perturbations_down L1000_Kinase_and_GPCR_Perturbations_up LINCS_L1000_CRISPR_KO_Consensus_Sigs LINCS_L1000_Chem_Pert_Consensus_Sigs LINCS_L1000_Chem_Pert_down LINCS_L1000_Chem_Pert_up LINCS_L1000_Ligand_Perturbations_down LINCS_L1000_Ligand_Perturbations_up Ligand_Perturbations_from_GEO_down Ligand_Perturbations_from_GEO_up MAGMA_Drugs_and_Diseases MAGNET_2023 MCF7_Perturbations_from_GEO_down MCF7_Perturbations_from_GEO_up MGI_Mammalian_Phenotype_2013 MGI_Mammalian_Phenotype_2017 MGI_Mammalian_Phenotype_Level_3 MGI_Mammalian_Phenotype_Level_4 MGI_Mammalian_Phenotype_Level_4_2019 MGI_Mammalian_Phenotype_Level_4_2021 MGI_Mammalian_Phenotype_Level_4_2024 MSigDB_Computational MSigDB_Hallmark_2020 MSigDB_Oncogenic_Signatures Metabolomics_Workbench_Metabolites_2022 Microbe_Perturbations_from_GEO_down Microbe_Perturbations_from_GEO_up MoTrPAC_2023 Mouse_Gene_Atlas NCI-60_Cancer_Cell_Lines NCI-Nature_2016 NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions NIH_Funded_PIs_2017_Human_AutoRIF NIH_Funded_PIs_2017_Human_GeneRIF NURSA_Human_Endogenous_Complexome OMIM_Disease OMIM_Expanded Old_CMAP_down Old_CMAP_up Orphanet_Augmented_2021 PFOCR_Pathways PFOCR_Pathways_2023 PPI_Hub_Proteins PanglaoDB_Augmented_2021 Panther_2015 Panther_2016 Pfam_Domains_2019 Pfam_InterPro_Domains PheWeb_2019 PhenGenI_Association_2021 Phosphatase_Substrates_from_DEPOD ProteomicsDB_2020 Proteomics_Drug_Atlas_2023 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO RNAseq_Automatic_GEO_Signatures_Human_Down RNAseq_Automatic_GEO_Signatures_Human_Up RNAseq_Automatic_GEO_Signatures_Mouse_Down RNAseq_Automatic_GEO_Signatures_Mouse_Up Rare_Diseases_AutoRIF_ARCHS4_Predictions Rare_Diseases_AutoRIF_Gene_Lists Rare_Diseases_GeneRIF_ARCHS4_Predictions Rare_Diseases_GeneRIF_Gene_Lists Reactome_2013 Reactome_2015 Reactome_2016 Reactome_2022 Rummagene_kinases Rummagene_signatures Rummagene_transcription_factors SILAC_Phosphoproteomics SubCell_BarCode SynGO_2022 SynGO_2024 SysMyo_Muscle_Gene_Sets TF-LOF_Expression_from_GEO TF_Perturbations_Followed_by_Expression TG_GATES_2020 TRANSFAC_and_JASPAR_PWMs TRRUST_Transcription_Factors_2019 Table_Mining_of_CRISPR_Studies Tabula_Muris Tabula_Sapiens TargetScan_microRNA TargetScan_microRNA_2017 The_Kinase_Library_2023 The_Kinase_Library_2024 Tissue_Protein_Expression_from_Human_Proteome_Map Tissue_Protein_Expression_from_ProteomicsDB Transcription_Factor_PPIs UK_Biobank_GWAS_v1 Virus-Host_PPI_P-HIPSTer_2020 VirusMINT Virus_Perturbations_from_GEO_down Virus_Perturbations_from_GEO_up WikiPathway_2021_Human WikiPathway_2023_Human WikiPathways_2013 WikiPathways_2015 WikiPathways_2016 WikiPathways_2019_Human WikiPathways_2019_Mouse WikiPathways_2024_Human WikiPathways_2024_Mouse dbGaP huMAP lncHUB_lncRNA_Co-Expression miRTarBase_2017
In [314]:
glist = sc.get.rank_genes_groups_df(cl1, group='Covid',
key='wilcoxon',
log2fc_min=0.25,
pval_cutoff=0.01)['names'].squeeze().str.strip().tolist()
print(len(glist))
185
In [315]:
enr_res = gseapy.enrichr(gene_list=glist,
organism='Human',
gene_sets='GO_Biological_Process_2018',
cutoff = 0.5)
In [316]:
enr_res.results.head()
Out[316]:
| Gene_set | Term | Overlap | P-value | Adjusted P-value | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GO_Biological_Process_2018 | cellular response to type I interferon (GO:007... | 11/65 | 1.824009e-11 | 1.177398e-08 | 0 | 0 | 23.134419 | 572.054017 | BST2;ISG20;IFITM1;SP100;MX1;IFI6;ISG15;IFI35;X... |
| 1 | GO_Biological_Process_2018 | type I interferon signaling pathway (GO:0060337) | 11/65 | 1.824009e-11 | 1.177398e-08 | 0 | 0 | 23.134419 | 572.054017 | BST2;ISG20;IFITM1;SP100;MX1;IFI6;ISG15;IFI35;X... |
| 2 | GO_Biological_Process_2018 | cytokine-mediated signaling pathway (GO:0019221) | 27/633 | 3.509208e-11 | 1.510129e-08 | 0 | 0 | 5.416750 | 130.397675 | IFITM1;SP100;IFI6;IFI35;IFIT3;PSMA7;MT2A;SOCS1... |
| 3 | GO_Biological_Process_2018 | antigen receptor-mediated signaling pathway (G... | 15/257 | 1.778104e-08 | 5.738830e-06 | 0 | 0 | 7.136485 | 127.351529 | TRAC;THEMIS;CD3G;CD3D;PSMA7;PSMB8;PSMB9;NFKBIA... |
| 4 | GO_Biological_Process_2018 | T cell receptor signaling pathway (GO:0050852) | 12/163 | 3.951256e-08 | 1.020214e-05 | 0 | 0 | 9.032959 | 153.981673 | NFKBIA;PSMC5;LCK;TRAC;TRBC1;THEMIS;PSME2;CD3G;... |
In [317]:
# Barplot for scanning GSEA
gseapy.barplot(enr_res.res2d,title='GO_Biological_Process_2018')
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/gseapy/plot.py:694: FutureWarning: The 'method' keyword in Series.replace is deprecated and will be removed in a future version.
df[self.colname].replace(
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/gseapy/plot.py:694: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df[self.colname].replace(
Out[317]:
<Axes: title={'center': 'GO_Biological_Process_2018'}, xlabel='$- \\log_{10}$ (Adjusted P-value)'>
Enrichment - GSEA¶
In [318]:
# Get the gene ranking data for the group 'Covid' using Wilcoxon test
gene_rank = sc.get.rank_genes_groups_df(cl1, group='Covid', key='wilcoxon')[['names', 'logfoldchanges']]
# Sort the genes by log fold change in descending order
gene_rank.sort_values(by=['logfoldchanges'], inplace=True, ascending=False)
In [319]:
# Calculate QC metrics for the cells in cl1
sc.pp.calculate_qc_metrics(cl1, percent_top=None, log1p=False, inplace=True)
# Filter out genes that are expressed in fewer than 35 cells
gene_rank = gene_rank[gene_rank['names'].isin(cl1.var_names[cl1.var.n_cells_by_counts > 30])]
In [320]:
# Gene ranking based on log fold change
gene_rank
Out[320]:
| names | logfoldchanges | |
|---|---|---|
| 0 | RPS4Y1 | 31.352980 |
| 35 | DDX3Y | 29.129417 |
| 89 | EIF1AY | 28.553265 |
| 196 | TTTY15 | 28.093138 |
| 309 | UTY | 27.709085 |
| ... | ... | ... |
| 18439 | AC004556.1 | -27.532579 |
| 18263 | LGALS2 | -27.686165 |
| 18447 | TMEM176B | -27.858284 |
| 18554 | MPEG1 | -27.994612 |
| 18515 | IFI30 | -28.013903 |
8800 rows × 2 columns
In [321]:
# Perform GSEA prerank analysis using KEGG 2021 Human gene sets
res = gseapy.prerank(rnk=gene_rank.set_index('names')['logfoldchanges'], gene_sets='KEGG_2021_Human')
In [322]:
# Extract the top terms from the GSEA results
terms = res.res2d.index[:20]
terms
Out[322]:
RangeIndex(start=0, stop=20, step=1)
In [323]:
# # Plot the results for the top-ranked term
# if len(terms) > 0:
# gseapy.gseaplot(rank_metric=res.ranking, term=terms[0], **res.results[terms[0]])
# else:
# print("No significant terms found for plotting.")
In [324]:
adata.write_h5ad('./data/results/scanpy_DGE_covid.h5ad')
In [325]:
sc.settings.set_figure_params(dpi=80)
In [326]:
adata = sc.read_h5ad('./data/results/scanpy_clustered_covid.h5ad')
In [327]:
print(adata.shape)
print(adata.raw.shape)
(4888, 3181) (4888, 18752)
In [328]:
# Subset for ctrl_13 sample.
adata = adata[adata.obs["sample"] == "ctrl_13",:]
print(adata.shape)
(1101, 3181)
In [329]:
sc.pl.umap(adata, color=["louvain_0.6"], palette= sc.pl.palettes.default_20)
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/plotting/_utils.py:487: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual. adata.uns[value_to_plot + "_colors"] = colors_list
In [330]:
# load reference data
adata_ref = sc.datasets.pbmc3k_processed()
In [331]:
adata_ref.obs['sample']='pbmc3k'
print(adata_ref.shape)
adata_ref.obs
(2638, 1838)
Out[331]:
| n_genes | percent_mito | n_counts | louvain | sample | |
|---|---|---|---|---|---|
| index | |||||
| AAACATACAACCAC-1 | 781 | 0.030178 | 2419.0 | CD4 T cells | pbmc3k |
| AAACATTGAGCTAC-1 | 1352 | 0.037936 | 4903.0 | B cells | pbmc3k |
| AAACATTGATCAGC-1 | 1131 | 0.008897 | 3147.0 | CD4 T cells | pbmc3k |
| AAACCGTGCTTCCG-1 | 960 | 0.017431 | 2639.0 | CD14+ Monocytes | pbmc3k |
| AAACCGTGTATGCG-1 | 522 | 0.012245 | 980.0 | NK cells | pbmc3k |
| ... | ... | ... | ... | ... | ... |
| TTTCGAACTCTCAT-1 | 1155 | 0.021104 | 3459.0 | CD14+ Monocytes | pbmc3k |
| TTTCTACTGAGGCA-1 | 1227 | 0.009294 | 3443.0 | B cells | pbmc3k |
| TTTCTACTTCCTCG-1 | 622 | 0.021971 | 1684.0 | B cells | pbmc3k |
| TTTGCATGAGAGGC-1 | 454 | 0.020548 | 1022.0 | B cells | pbmc3k |
| TTTGCATGCCTCAC-1 | 724 | 0.008065 | 1984.0 | CD4 T cells | pbmc3k |
2638 rows × 5 columns
In [332]:
sc.pl.umap(adata_ref, color='louvain')
In [333]:
print(adata_ref.shape[1])
print(adata.shape[1])
var_names = adata_ref.var_names.intersection(adata.var_names)
print(len(var_names))
1838 3181 525
In [334]:
adata_ref = adata_ref[:, var_names]
adata = adata[:, var_names]
In [335]:
sc.pp.pca(adata_ref)
sc.pp.neighbors(adata_ref)
sc.tl.umap(adata_ref)
sc.pl.umap(adata_ref, color='louvain')
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_pca.py:317: ImplicitModificationWarning: Setting element `.obsm['X_pca']` of view, initializing view as actual. adata.obsm[key_obsm] = X_pca
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:04)
In [336]:
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.pl.umap(adata, color='louvain_0.6')
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/scanpy/preprocessing/_pca.py:317: ImplicitModificationWarning: Setting element `.obsm['X_pca']` of view, initializing view as actual. adata.obsm[key_obsm] = X_pca
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:02)
Integrating with Sanoma¶
In [337]:
import scanorama
#subset the individual dataset to the same variable genes as in MNN-correct.
alldata = dict()
alldata['ctrl']=adata
alldata['ref']=adata_ref
#convert to list of AnnData objects
adatas = list(alldata.values())
# run scanorama.integrate
scanorama.integrate_scanpy(adatas, dimred = 50)
Found 525 genes among all datasets [[0. 0.96548592] [0. 0. ]] Processing datasets (0, 1)
In [338]:
# add in sample info
adata_ref.obs['sample']='pbmc3k'
In [339]:
# create a merged scanpy object and add in the scanorama
adata_merged = alldata['ctrl'].concatenate(alldata['ref'], batch_key='sample', batch_categories=['ctrl','pbmc3k'])
embedding = np.concatenate([ad.obsm['X_scanorama'] for ad in adatas], axis=0)
adata_merged.obsm['Scanorama'] = embedding
/var/folders/mn/jc4_tkl910gcsv_h9ddxdzqr0000gn/T/ipykernel_82400/2358018963.py:2: FutureWarning: Use anndata.concat instead of AnnData.concatenate, AnnData.concatenate is deprecated and will be removed in the future. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html adata_merged = alldata['ctrl'].concatenate(alldata['ref'], batch_key='sample', batch_categories=['ctrl','pbmc3k'])
In [340]:
#run umap.
sc.pp.neighbors(adata_merged, n_pcs =50, use_rep = "Scanorama")
sc.tl.umap(adata_merged)
computing neighbors
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm)
'umap', UMAP parameters (adata.uns) (0:00:06)
In [341]:
sc.pl.umap(adata_merged, color=["sample","louvain"])
modifying labels¶
In [342]:
from sklearn.metrics.pairwise import cosine_distances
distances = 1 - cosine_distances(
adata_merged[adata_merged.obs['sample'] == "pbmc3k"].obsm["Scanorama"],
adata_merged[adata_merged.obs['sample'] == "ctrl"].obsm["Scanorama"],
)
In [343]:
# def label_transfer(dist, labels, index):
# lab = pd.get_dummies(labels)
# class_prob = lab.to_numpy().T @ dist
# norm = np.linalg.norm(class_prob, 2, axis=0)
# class_prob = class_prob / norm
# class_prob = (class_prob.T - class_prob.min(1)) / class_prob.ptp(1)
# # convert to df
# cp_df = pd.DataFrame(
# class_prob, columns=lab.columns
# )
# cp_df.index = index
# # classify as max score
# m = cp_df.idxmax(axis=1)
In [344]:
def label_transfer(dist, labels, index):
# Compute the class probabilities
class_prob = 1 / (1 + dist)
# Normalize
norm = np.linalg.norm(class_prob, 2, axis=0)
class_prob = class_prob / norm
# Replace `class_prob.ptp(1)` with `np.ptp(class_prob, axis=1)`
class_prob = (class_prob.T - class_prob.min(1)) / np.ptp(class_prob, axis=1)
# Convert to dataframe
cp_df = pd.DataFrame(class_prob, columns=labels, index=index)
# Find the most likely class for each observation
class_def = cp_df.idxmax(1)
return class_def
# Run label transfer
class_def = label_transfer(distances, adata_ref.obs.louvain, adata.obs.index)
# Add the predicted class to adata.obs
adata.obs['predicted'] = class_def
In [345]:
sc.pl.umap(adata, color="predicted")
In [346]:
# Concatenate the predicted class with the louvain labels and convert to a list
class_def_combined = pd.concat([class_def, adata_ref.obs['louvain']]).tolist()
# Add this combined list to the merged object
adata_merged.obs['predicted'] = class_def_combined
# Plot UMAP for the merged data
sc.pl.umap(adata_merged, color=["sample", "louvain", "predicted"])
# Plot only for ctrl cells
sc.pl.umap(adata_merged[adata_merged.obs['sample'] == 'ctrl'], color='predicted')
INGEST - integration¶
In [347]:
sc.tl.ingest(adata, adata_ref, obs='louvain')
running ingest
/Users/zeus_10/Desktop/Projects/Single cell_scanpy/single_cell_covid/.venv/lib/python3.11/site-packages/umap/umap_.py:1945: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
finished (0:00:09)
In [348]:
# RUNNING umap of Injested sample
sc.pl.umap(adata, color=['louvain','louvain_0.6'], wspace=0.5)
In [349]:
## Comparing the results
sc.pl.umap(adata, color=['louvain','predicted'], wspace=0.5)
In [365]:
df_marker = pd.read_excel('./data/Cell_marker_Human.xlsx')
In [366]:
df_marker
Out[366]:
| species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cell_name | cellontology_id | marker | Symbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | MERTK | MERTK | 10461.0 | protein_coding | MER proto-oncogene, tyrosine kinase | Q12866 | NaN | Experiment | 31982413.0 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020.0 |
| 1 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD16 | FCGR3A | 2215.0 | protein_coding | Fc fragment of IgG receptor IIIb | O75015 | NaN | Experiment | 31982413.0 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020.0 |
| 2 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD206 | MRC1 | 4360.0 | protein_coding | mannose receptor C-type 1 | P22897 | NaN | Experiment | 31982413.0 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020.0 |
| 3 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CRIg | VSIG4 | 11326.0 | protein_coding | V-set and immunoglobulin domain containing 4 | Q9Y279 | NaN | Experiment | 31982413.0 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020.0 |
| 4 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD163 | CD163 | 9332.0 | protein_coding | CD163 molecule | Q86VB7 | NaN | Experiment | 31982413.0 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 60872 | Human | Vein | Vein | UBERON_0001638 | Normal | Normal cell | Venous cell | NaN | LRG1 | LRG1 | 116844.0 | protein_coding | leucine rich alpha-2-glycoprotein 1 | P02750 | 10x Chromium | Experiment | 33996252.0 | Single-cell transcriptomics reveals the landsc... | Molecular therapy. Nucleic acids | 2021.0 |
| 60873 | Human | Vein | Vein | UBERON_0001638 | Normal | Normal cell | Venous cell | NaN | EMCN | EMCN | 51705.0 | protein_coding | endomucin | Q9ULC0 | 10x Chromium | Experiment | 33996252.0 | Single-cell transcriptomics reveals the landsc... | Molecular therapy. Nucleic acids | 2021.0 |
| 60874 | Human | Vein | Vein | UBERON_0001638 | Normal | Normal cell | Systemic–venous endothelial cell | NaN | COL15A1 | COL15A1 | 1306.0 | protein_coding | collagen type XV alpha 1 chain | B3KTP7 | 10x Chromium | Experiment | 34030460.0 | Integrated Single-Cell Atlas of Endothelial Ce... | Circulation | 2021.0 |
| 60875 | Human | NaN | Intestine | UBERON_0000160 | Normal | Normal cell | B cell | CL_0000236 | CD19 | CD19 | 930.0 | protein_coding | CD19 molecule | P15391 | 10x Chromium | Experiment | 34269788.0 | CD16+CD163+ monocytes traffic to sites of infl... | The Journal of experimental medicine | 2021.0 |
| 60876 | Human | NaN | Intestine | UBERON_0000160 | Normal | Normal cell | B cell | CL_0000236 | CD19 | CD19 | 930.0 | protein_coding | CD19 molecule | P15391 | 10x Chromium | Experiment | 34875227.0 | Stem-like intestinal Th17 cells give rise to p... | Cell | 2021.0 |
60877 rows × 20 columns
In [352]:
df_marker.shape
Out[352]:
(60877, 20)
In [369]:
# Filter for number of genes.
print(df_marker.shape)
df_marker['nG'] = df.Symbol.str.split(",").str.len()
df_marker.shape
(60877, 21)
Out[369]:
(60877, 21)
In [368]:
(0, 21)
In [ ]: